import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import plotly.graph_objs as go
from datetime import date
from urllib.request import urlopen
import json
from jupyter_dash import JupyterDash
from dash.dependencies import Input, Output
import dash_html_components as html
import dash_core_components as dcc
import warnings
warnings.filterwarnings('ignore')
from IPython.display import HTML
#https://stackoverflow.com/questions/27934885/how-to-hide-code-from-cells-in-ipython-notebook-visualized-with-nbviewer
# Notebook-only helper: injects JavaScript/CSS so a button can toggle the
# visibility of input cells. Only has a visible effect when this module is run
# inside Jupyter/nbviewer; as a plain script the HTML object is simply discarded.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the code."></form>''')







"""
Function to web scrape most up to date data from usafacts website so that the code
is always up to date with the most recent COVID-19 information
"""
def scrape_live_data():
# Load datasets dynamically from URLs
confirm_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv', header=0)
# , encoding = "ISO-8859-1"
pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)
# Combine into one dataset
confirmed_cases = pd.merge(left=confirm_df, right=pop_df, how='left', left_on=['countyFIPS','State'], right_on=['countyFIPS','State'])
# Rename/Drop/Move columns
# get list of column names
all_columns = confirmed_cases.columns.values
# rename county Name_x with county name
all_columns[1] = 'County Name'
# make updated column names list columns of dataframe
confirmed_cases.columns = all_columns
# drop additional county names column added on merge
confirmed_cases.drop(labels=['County Name_y'], axis=1, inplace = True)
# save the population column so we can add back into df later
population_column = confirmed_cases['population']
# drop population column from dataframe
confirmed_cases.drop(labels=['population'], axis=1, inplace = True)
# insert population column back in as 3rd column in df
confirmed_cases.insert(3, 'population', population_column)
indexes = confirmed_cases.loc[[1835, 1862, 1863]].index
confirmed_cases.drop(indexes, inplace = True)
return confirmed_cases
def scrape_live_data_deaths():
    """Download the latest USAFacts death-count and county-population CSVs and
    merge them into one dataframe (same layout as scrape_live_data, but the
    date columns hold cumulative deaths instead of cases).
    """
    # Load datasets dynamically from URLs
    death_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv', header=0)
    pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)
    # Combine into one dataset keyed on county FIPS + state
    confirmed_deaths = pd.merge(left=death_df, right=pop_df, how='left',
                                on=['countyFIPS', 'State'])
    # The merge suffixes the duplicated county-name column: keep the left one
    # under its original name and drop the right one.
    confirmed_deaths = confirmed_deaths.rename(columns={'County Name_x': 'County Name'})
    confirmed_deaths.drop(labels=['County Name_y'], axis=1, inplace=True)
    # Move population to be the 4th column (pop removes and returns it)
    population_column = confirmed_deaths.pop('population')
    confirmed_deaths.insert(3, 'population', population_column)
    # Drop known-bad rows by positional index label. BUG FIX: the original
    # .loc[[1835, 1862, 1863]] raises KeyError if the upstream file changes
    # shape; intersect with the actual index so a shift can't crash the scrape.
    bad_rows = confirmed_deaths.index.intersection([1835, 1862, 1863])
    confirmed_deaths.drop(bad_rows, inplace=True)
    return confirmed_deaths
def exploratoryDA(confirmed_cases):
    """Print a quick exploratory summary of the dataframe: head, tail, dtypes,
    per-column counts, and null counts. Returns None (output is print-only).

    BUG FIX: the original last line ended with a literal ``(edited)`` chat-paste
    artifact, which is a syntax error that prevented the whole file from running.
    """
    # checking first few rows to get a sense of data
    print("##############print first 10 rows:#################")
    print(confirmed_cases.head(10))
    # checking last ten rows
    print("##############print last 10 rows:(##############")
    print(confirmed_cases.tail(10))
    # checking the data types of the columns
    print("##############print column data types:##############")
    print(confirmed_cases.dtypes)
    # getting a sense of count
    print("#############print column counts:##############")
    print(confirmed_cases.count())
    # checking for nulls
    print("##############Check nulls:##############")
    print(confirmed_cases.isnull().sum())
#arrange columns
def createStatePOP(confirmed_cases):
    """Aggregate the per-county population column into one total per state."""
    # pivot_table with a sum aggregator == group-by-state sum;
    # reset_index turns State back into an ordinary column
    state_totals = confirmed_cases.pivot_table(values=['population'],
                                               index=['State'],
                                               aggfunc='sum').reset_index()
    return state_totals
def meltData(confirmed_cases):
    """Return total confirmed cases per state as of the most recent date column.

    Side effect: writes the intermediate long-format frame to df_confirm_long.csv.
    """
    # date columns start at position 6 of the merged frame (mirrors the
    # original slicing -- verify against the live CSV layout if it changes)
    date_cols = confirmed_cases.columns[6:]
    # unpivot: one row per (State, Date) with the case count in 'Confirmed'
    long_df = confirmed_cases.melt(id_vars=['State'], value_vars=date_cols,
                                   var_name='Date', value_name='Confirmed')
    long_df.to_csv('df_confirm_long.csv', index=True)
    # coerce counts to numbers; bad cells become NaN instead of raising
    long_df['Confirmed'] = pd.to_numeric(long_df['Confirmed'], errors='coerce')
    # counts are cumulative, so the latest date column holds the to-date total
    latest = date_cols[-1]
    totals = (long_df[long_df['Date'] == latest]
              .groupby(['State'])['Confirmed'].sum().reset_index())
    return totals
# need to run after merge, arrange, melt
def createRate(df_statepop, df2_melt):
    """Attach a 'Rate' column: confirmed cases as a percentage of state population."""
    # join the per-state population totals onto the per-state case totals
    merged = df2_melt.merge(df_statepop, how='left', on='State')
    merged['Rate'] = merged['Confirmed'] / merged['population'] * 100
    return merged
# need to run after merge, arrange, melt, create rate
# Plot geo map for statewide rate = cases/population
def createUSAMap(createRate):
    """Render a US choropleth of each state's confirmed-case rate and write it
    to state_rate.html.

    The parameter name is kept for caller compatibility: it is the dataframe
    produced by the createRate() function (columns State, Confirmed,
    population, Rate).
    """
    trace = dict(
        type='choropleth',
        colorscale='Portland',
        locations=createRate['State'],
        locationmode='USA-states',
        z=createRate['Rate'],
        text=createRate['State'],
        colorbar={'title': 'Cases/Population%'},
        marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
    )
    layout = dict(
        title='Percentage of COVID-19 Confirmed Cases to Date by Popualtion of the State',
        geo=dict(scope='usa',
                 showlakes=True,  # lakes
                 lakecolor='rgb(255, 255, 255)'),
    )
    fig = go.Figure(trace)
    fig.update_layout(layout)
    fig.show()
    fig.write_html("state_rate.html")
# Storing data by Month to use for the heat map
# need to run merge before this
def structureDataTimeSeries(confirmed_cases):
    """Build a long-format time series: one row per (Date, State) with the
    summed confirmed count, and Date converted to real datetimes.
    """
    # population is not needed for the time series
    no_pop = confirmed_cases.drop(columns=['population'])
    # date columns start at position 5 after the drop (mirrors the original slice)
    date_cols = no_pop.columns[5:]
    # unpivot into (State, Date, Confirmed) rows
    long_df = no_pop.melt(id_vars='State', value_vars=date_cols,
                          var_name='Date', value_name='Confirmed')
    # aggregate counties up to the state level for each date
    long_df = long_df.groupby(['Date', 'State'])['Confirmed'].sum().reset_index()
    # date strings -> datetime64 so downstream code can use .dt accessors
    long_df['Date'] = pd.to_datetime(long_df['Date'])
    return long_df
# need to run merge, arrange, structure before this
def createHeatMap(df_confirm_long):
    """Draw a state-by-month heat map of peak confirmed cases and save it to
    heatmap.png and heatmapByState.png. Returns the pyplot module (as the
    original did).

    Side effect: also writes the monthly aggregate to df_cases_months.csv and
    adds a 'month' column to the input dataframe.
    """
    # derive month names from the datetime Date column
    df_confirm_long['month'] = df_confirm_long['Date'].dt.month_name()
    # counts are cumulative, so each month's max is its end-of-month total;
    # sort=False preserves chronological first-appearance order of the months
    df_month = df_confirm_long.groupby(['State', 'month'], sort=False)['Confirmed'].max().reset_index()
    # saving the dataframe for back-up
    df_month.to_csv('df_cases_months.csv', index=True)
    # wide table: rows = states, columns = months
    heatmap_data = pd.pivot_table(df_month, values='Confirmed', index='State', columns='month')
    # reorder the month columns chronologically (pivot sorts them alphabetically)
    months = df_month.month.unique().tolist()
    heatmap_data = heatmap_data.reindex(columns=months)
    # BUG FIX: the original used figsize=(200, 200) -- a 200-INCH figure --
    # and called savefig("heatmapByState.png") *after* plt.show(), which
    # saves an empty canvas because show() consumes the current figure.
    # Use a sane size and write both files before showing.
    plt.figure(figsize=(12, 18))
    m = sns.heatmap(heatmap_data, cmap='RdBu_r', robust=True)
    m.set_xticklabels(heatmap_data.columns, rotation=45)
    m.set_yticklabels(heatmap_data.index, rotation=45)
    m.set_title('Confirmed Cases by month for all the States in USA')
    plt.savefig('heatmap.png', dpi=150)
    plt.savefig("heatmapByState.png")
    plt.show()
    return plt
def createLinePlot(df_confirm_long):
    """Plot confirmed-case trends (log y-axis) for AZ, CA, FL, NY and TX."""
    tracked = ['TX', 'CA', 'FL', 'AZ', 'NY']
    subset = df_confirm_long[df_confirm_long['State'].isin(tracked)]
    # one axes, log scale so early/late growth are both visible
    fig, ax = plt.subplots()
    ax.set(yscale="log")
    ax.set_title("COVID-19 Cases Trends for AZ, CA, FL, NY, TX")
    # one line per state, with per-state markers/line styles
    sns.lineplot(x='Date', y='Confirmed', data=subset, hue='State',
                 ax=ax, style='State', markers=True)
    plt.show()
def plot_geomap_counties(cases, deaths):
    """Plot an interactive county-level choropleth of total cases (hover shows
    county name and deaths). Counts are cumulative, so the latest date column
    is the to-date total.
    """
    last_date = cases.columns.tolist()[-1]
    # .copy() so the column assignments below operate on an owned frame
    # instead of a view (avoids SettingWithCopyWarning / silent no-ops)
    max_cases = cases[['countyFIPS', 'County Name', 'State', last_date]].copy()
    deaths = deaths[['countyFIPS', last_date]]
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[last_date]))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)
    # county boundary geometry keyed by 5-character FIPS strings
    with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
        counties = json.load(response)
    # the geojson stores FIPS zero-padded to 5 chars; pad our 4-char ones
    fips_new = {}
    for f in cases['countyFIPS'].astype(str).tolist():
        fips_new[f] = '0' + f if len(f) == 4 else f
    max_cases['countyFIPS'] = max_cases['countyFIPS'].astype(str)
    max_cases['fips'] = max_cases['countyFIPS'].map(fips_new)
    # BUG FIX: countyFIPS was just cast to str, so the original comparison
    # `!= 0` (an int) was True for every row and the statewide-unallocated
    # rows (FIPS 0) were never filtered out; compare against the string '0'.
    max_cases = max_cases.loc[max_cases['countyFIPS'] != '0']
    # plot the heatmap of cases
    fig = px.choropleth_mapbox(max_cases, geojson=counties, locations='fips', color=last_date,
                               range_color=(1, 5000),
                               mapbox_style="carto-positron",
                               zoom=3, center={"lat": 37.0902, "lon": -95.7129},
                               opacity=0.5,
                               labels={last_date: '# Cases', 'County Name': 'County: ', 'Deaths': '# Deaths'},
                               hover_data=['County Name', 'Deaths']
                               )
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.show()
def plot_bar(cases, deaths):
    """Show two bar charts: total COVID-19 cases and deaths per state.

    Counts are cumulative, so the latest date column is the to-date total.
    """
    last_date = cases.columns.tolist()[-1]
    # .copy() so adding the Deaths column doesn't warn about a chained view
    max_cases = cases[['countyFIPS', 'County Name', 'State', last_date]].copy()
    deaths = deaths[['countyFIPS', last_date]]
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[last_date]))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)
    # BUG FIX: select the numeric columns explicitly -- summing the string
    # 'County Name' column either errors or concatenates names under newer
    # pandas, which broke the 4-name column rename below.
    state_counts = max_cases.groupby('State')[['countyFIPS', last_date, 'Deaths']].sum().reset_index()
    state_counts.columns = ['State', 'CountyFIPS', 'NumCases', 'NumDeaths']
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the intended by-cases ordering was never applied.
    state_counts = state_counts.sort_values(by='NumCases')
    # bar chart of case counts
    fig = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumCases'])
    ])
    fig.update_layout(barmode='group', title='Number of COVID-19 Cases by State')
    fig.show()
    # bar chart of death counts
    fig2 = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumDeaths'])
    ])
    fig2.update_layout(barmode='group', title='Number of COVID-19 Deaths by State')
    fig2.show()
"""
Function to get the differences in cases for the entire dataset
"""
def get_diff_by_state(cases, state_name):
# add try except block here. Also room for testing to make sure that correct state chosen
state = cases.loc[cases['State'] == str(state_name)]
# transpose dataframe so dates are rows
stateT = state.T
# make county names the columns for transposed data frame
stateT.columns=stateT.iloc[1]
# drop statewide counts
stateT = stateT.drop(columns='Statewide Unallocated')
# drop the first five rows because that is data we are not interested in in this instance
stateT = stateT.iloc[5:]
# create empty dictionary so we can store the difference in cases for each county in state
diff_counts = {}
for col in stateT:
# get the column and find the difference for each row
col_cases = stateT[col].diff()
# append new series to dictionary as value with key as county name
diff_counts[col] = col_cases
# create dataframe with differences from each state by day
diff_counties = pd.DataFrame.from_dict(diff_counts)
return diff_counties
"""
Function to get the differences in cases for the last week
"""
def get_diff_last_week(cases, state_name):
# add try except block here. Also room for testing to make sure that correct state chosen
state = cases.loc[cases['State'] == state_name]
# transpose dataframe so dates are rows
stateT = state.T
# make county names the columns for transposed data frame
stateT.columns=stateT.iloc[1]
# drop statewide counts
stateT = stateT.drop(columns='Statewide Unallocated')
# drop the first three rows because that is data we are not interested in in this instance
stateT = stateT.iloc[5:]
# create empty dictionary so we can store the difference in cases for each county in state
diff_counts = {}
for col in stateT:
# get the column and find the difference for each row
col_cases = stateT[col].diff()
# append new series to dictionary as value with key as county name
diff_counts[col] = col_cases
# create dataframe with differences from each state by day
diff_counties = pd.DataFrame.from_dict(diff_counts)
# get the date for today so that we can update our data for last week
today = pd.to_datetime(date.today())
# get the date for a week prior so that we can filter our datetime column
week = today - pd.Timedelta(days=14)
# reset index so that we can get the Date as a column
diff_counties = diff_counties.reset_index().rename(columns={'index':'Date'})
# convert the date to pandas datetime format
diff_counties['Date'] = pd.to_datetime(diff_counties['Date'], infer_datetime_format=True)
# get the data where the date is greater than the last week
diff_counties = diff_counties.loc[diff_counties['Date'] >= week]
diff_counties.index = pd.to_datetime(diff_counties['Date'], format = '%m/%d/%Y')
diff_counties.index = diff_counties.index.strftime('%Y-%m-%d')
diff_counties = diff_counties.drop(columns=['Date'])
return diff_counties
# ---- driver: fetch live data, then render every visualization ----
confirmed_cases = scrape_live_data()
confirmed_deaths = scrape_live_data_deaths()

# arrange columns to produce geo map
statePOP = createStatePOP(confirmed_cases)
# melt
meltData_df = meltData(confirmed_cases)
# cases/population rate used by the geo map.
# BUG FIX: the original stored this result in a variable named `createRate`,
# shadowing the createRate() function itself and breaking any later call to it.
rate_df = createRate(statePOP, meltData_df)
# structure for heat and line plot
df_confirm_long = structureDataTimeSeries(confirmed_cases)

# geomap
createUSAMap(rate_df)
plot_bar(confirmed_cases, confirmed_deaths)
# Line plot
createLinePlot(df_confirm_long)

plot_geomap_counties(confirmed_cases, confirmed_deaths)